{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pycocotools.coco import COCO\n",
    "import layoutparser as lp\n",
    "import random\n",
    "import cv2\n",
    "import os\n",
    "import openpyxl\n",
    "from pathlib import Path\n",
    "import subprocess\n",
    "from collections import defaultdict\n",
    "import pandas as pd\n",
    "import re\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure out working directory\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make sure directory points to the highest level of the docker container, or the folder connected to \"Replication Attempt 1\"\n",
    "os.chdir(\"..\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create lists of cities and training data size to iterate over\n",
    "cities = [\"Santa Rosa\",\"Temecula\",\"South San Francisco\",\"Visalia\",\"Chula Vista\"]\n",
    "meetings = [\"One Meeting\", \"Two Meetings\",\"Three Meetings\",\"Four Meetings\",\"Five Meetings\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each training data size (Meetings per year included) for each city, create data extraction folders\n",
    "for city in cities:\n",
    "    for meeting in meetings:\n",
    "        folder_path = Path(\"Step 4/Data Extraction/\"+city+\"/\"+meeting)\n",
    "        folder_path.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the confidence thresholds to be used and subset the list of cities to those that have true PDFs as meeting records. \n",
    "# This allows for a simplified data extraction process\n",
    "cities = [\"Santa Rosa\",\"Temecula\",\"Visalia\",\"Chula Vista\"]\n",
    "thresholds = [.5,.6,.7,.8,.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For the cities in the subset, open each pdf, convert each page to an image, then use the correct fine tuned model to extract agenda items and save to a string.\n",
    "# Finally, save output as excel file\n",
    "for city in cities:\n",
    "    pdfs = []\n",
    "    folder_path = Path(f\"Step 1/Raw PDFS/{city}/\")\n",
    "    subfolders = [r.name for r in os.scandir(folder_path) if r.is_dir()]\n",
    "    for subfolder in subfolders:\n",
    "        files = os.listdir(Path(f\"Step 1/Raw PDFS/{city}/{subfolder}/\"))\n",
    "        for file in files:\n",
    "            full = Path(f\"Step 1/Raw PDFS/{city}/{subfolder}/{file}\")\n",
    "            pdfs.append(full)\n",
    "    pdf_bibs = defaultdict(list)\n",
    "    pdf_dates = list()\n",
    "    pdf_text = list()\n",
    "    model_type = list()\n",
    "    for threshold in thresholds:\n",
    "        model = lp.Detectron2LayoutModel(\n",
    "        config_path = f\"Step 3/Model Output/{city}/Two Meetings/From Manuscript/config.yaml\",\n",
    "        model_path = f\"Step 3/Model Output/{city}/Two Meetings/From Manuscript/model_final.pth\",\n",
    "        extra_config = [\"MODEL.ROI_HEADS.SCORE_THRESH_TEST\", threshold] # <-- Only output high accuracy preds\n",
    "        )\n",
    "        for meeting in pdfs:\n",
    "            file = meeting\n",
    "            pdf_tokens, pdf_images = lp.load_pdf(file, load_images=True)\n",
    "            for page_index in range(len(pdf_images)): #Reference Section \n",
    "                bib_items = model.detect(pdf_images[page_index])\n",
    "                for bib_item in bib_items:\n",
    "                    bib_tokens = pdf_tokens[page_index].filter_by(bib_item, center=True)\n",
    "                    bib_text = \" \".join(bib_tokens.get_texts())\n",
    "                    pdf_text.append(bib_text)\n",
    "                    pdf_dates.append(meeting)\n",
    "                    model_type.append(threshold)\n",
    "            print(meeting)\n",
    "            print(threshold)\n",
    "\n",
    "    d = {'text':pdf_text,'Day':pdf_dates,'CT':model_type}\n",
    "    testdf3= pd.DataFrame(d)\n",
    "    testdf3.to_excel(Path(f\"Step 4/Data Extraction/{city}/Two Meetings/{city}_raw.xlsx\"))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Subset cities to those that require the use of OCR to extract agenda items rather than True PDF tokens\n",
    "cities = [\"South San Francisco\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import additional packages required for OCR text extraction\n",
    "import layoutparser.ocr as ocr\n",
    "import pdf2image\n",
    "import numpy as np\n",
    "ocr_agent = ocr.TesseractAgent(languages='eng')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# OCR text extraction can create characters that are invalid when saving an excel file. These two functions clean those characters out of the extracted text\n",
    "def clean_text_for_xml(text):\n",
    "    \"\"\"Removes or replaces invalid XML characters from a string.\"\"\"\n",
    "    if isinstance(text, str):\n",
    "        # Replace control characters (excluding tab, newline, carriage return) with a space\n",
    "        cleaned_text = re.sub(r\"[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f]\", \" \", text)\n",
    "        return cleaned_text\n",
    "    return text\n",
    "\n",
    "def clean_dataframe_for_excel(df):\n",
    "    \"\"\"Applies clean_text_for_xml to all string columns in a DataFrame.\"\"\"\n",
    "    for col in df.select_dtypes(include='object').columns:\n",
    "        df[col] = df[col].apply(clean_text_for_xml)\n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For the cities in the subset, open each pdf, convert each page to an image, then use the correct fine tuned model to extract agenda items and save to a string.\n",
    "# Finally, save output as excel file\n",
    "for city in cities:\n",
    "    pdfs = []\n",
    "    folder_path = Path(f\"Step 1/Raw PDFS/{city}/\")\n",
    "    subfolders = [r.name for r in os.scandir(folder_path) if r.is_dir()]\n",
    "    for subfolder in subfolders:\n",
    "        files = os.listdir(Path(f\"Step 1/Raw PDFS/{city}/{subfolder}/\"))\n",
    "        for file in files:\n",
    "            full = Path(f\"Step 1/Raw PDFS/{city}/{subfolder}/{file}\")\n",
    "            pdfs.append(full)\n",
    "    for threshold in thresholds:\n",
    "        pdf_bibs2 = defaultdict(list)\n",
    "        pdf_dates2 = list()\n",
    "        pdf_text2 = list()\n",
    "        model_type = list()\n",
    "        model = lp.Detectron2LayoutModel(\n",
    "        config_path = f\"Step 3/Model Output/{city}/Two Meetings/From Manuscript/config.yaml\",\n",
    "        model_path = f\"Step 3/Model Output/{city}/Two Meetings/From Manuscript/model_final.pth\",\n",
    "        extra_config = [\"MODEL.ROI_HEADS.SCORE_THRESH_TEST\", threshold] # <-- Only output high accuracy preds\n",
    "        )\n",
    "        for meeting in pdfs:\n",
    "            file = meeting\n",
    "            try:\n",
    "                img1 = np.asarray(pdf2image.convert_from_path(file), dtype=\"object\")\n",
    "                for page_index in range(len(img1)): #Reference Section\n",
    "                    img2 = np.asarray(pdf2image.convert_from_path(file)[page_index]) \n",
    "                    layout_result = model.detect(img2)\n",
    "                    text_blocks = lp.Layout([b for b in layout_result if b.type==0])\n",
    "                    for block in text_blocks:\n",
    "\n",
    "                # Crop image around the detected layout\n",
    "                        segment_image = (block\n",
    "                            .pad(left=15, right=15, top=5, bottom=5)\n",
    "                            .crop_image(img2))\n",
    "    \n",
    "                # Perform OCR\n",
    "                        text = ocr_agent.detect(segment_image)\n",
    "\n",
    "                # Save OCR result\n",
    "                        block.set(text=text, inplace=True)\n",
    "                    print(page_index)\n",
    "                    for txt in text_blocks:\n",
    "                        pdf_text2.append(txt.text)\n",
    "                        pdf_dates2.append(meeting)\n",
    "                        model_type.append(threshold)\n",
    "                print(meeting)\n",
    "                print(threshold)\n",
    "            except:\n",
    "                print(\"single page\")\n",
    "                img5 = np.dstack(pdf2image.convert_from_path(file))\n",
    "                layout_result = model.detect(img5)\n",
    "                text_blocks = lp.Layout([b for b in layout_result if b.type==0])\n",
    "                for block in text_blocks:\n",
    "\n",
    "            # Crop image around the detected layout\n",
    "                    segment_image = (block\n",
    "                        .pad(left=15, right=15, top=5, bottom=5)\n",
    "                        .crop_image(img5))\n",
    "    \n",
    "            # Perform OCR\n",
    "                    text = ocr_agent.detect(segment_image)\n",
    "\n",
    "            # Save OCR result\n",
    "                    block.set(text=text, inplace=True)\n",
    "                print(page_index)\n",
    "                for txt in text_blocks:\n",
    "                    pdf_text2.append(txt.text)\n",
    "                    pdf_dates2.append(meeting)\n",
    "                    model_type.append(threshold)\n",
    "                print(meeting)\n",
    "                print(threshold)\n",
    "\n",
    "\n",
    "        d2 = {'text':pdf_text2,'Day':pdf_dates2,'CT':model_type}\n",
    "        testdf3b= pd.DataFrame(d2)\n",
    "        df_cleaned = clean_dataframe_for_excel(testdf3b.copy())\n",
    "        df_cleaned.to_excel(Path(f\"Step 4/Data Extraction/{city}/Two Meetings/{city}_raw_{threshold}.xlsx\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
